In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
import statsmodels.api as sm
from statsmodels.formula.api import ols

Load the data

In [2]:
crime = pd.read_csv('formatted.csv', sep='\s*,\s*',encoding='latin-1',engine='python', na_values=["?"])
In [3]:
crime.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2218 entries, 0 to 2217
Columns: 151 entries, Unnamed: 0 to Division
dtypes: float64(145), int64(1), object(5)
memory usage: 2.6+ MB
In [4]:
crime.head()
Out[4]:
Unnamed: 0 communityName statecode countyCode communityCode fold population householdsize racepctblack racePctWhite ... larcPerPop autoTheft autoTheftPerPop arsons arsonsPerPop ViolentCrimesPerPop nonViolPerPop State Region Division
0 0 BerkeleyHeightstownship NJ 39.0 5320.0 1.0 11980.0 3.10 1.37 91.78 ... 1132.08 16.0 131.26 2.0 16.41 41.02 1394.59 New Jersey Northeast Middle Atlantic
1 1 Bricktownship NJ 29.0 7420.0 1.0 66473.0 2.66 0.63 97.81 ... 1773.32 98.0 143.15 14.0 20.45 131.47 2543.13 New Jersey Northeast Middle Atlantic
2 2 ScotchPlainstownship NJ 39.0 66060.0 1.0 21160.0 2.79 11.10 83.79 ... 1271.19 89.0 409.91 2.0 9.21 115.14 2160.10 New Jersey Northeast Middle Atlantic
3 3 Gallowaytownship NJ 1.0 25560.0 1.0 23330.0 2.94 7.36 88.58 ... 2332.30 46.0 192.61 7.0 29.31 251.24 3668.03 New Jersey Northeast Middle Atlantic
4 4 NewProvidenceborough NJ 39.0 51810.0 1.0 11439.0 2.70 0.54 94.18 ... 768.21 18.0 148.69 2.0 16.52 49.56 1032.55 New Jersey Northeast Middle Atlantic

5 rows × 151 columns

In [5]:
crime.columns
Out[5]:
Index(['Unnamed: 0', 'communityName', 'statecode', 'countyCode',
       'communityCode', 'fold', 'population', 'householdsize', 'racepctblack',
       'racePctWhite',
       ...
       'larcPerPop', 'autoTheft', 'autoTheftPerPop', 'arsons', 'arsonsPerPop',
       'ViolentCrimesPerPop', 'nonViolPerPop', 'State', 'Region', 'Division'],
      dtype='object', length=151)

Explore the data: Histograms

Check out min, mx, mean, median, and std for non-violent crimes

In [6]:
print('Minimum for violent crimes: ', crime['ViolentCrimesPerPop'].min())
print('Maximum for violent crimes: ', crime['ViolentCrimesPerPop'].max())
print('Mean for violent crimes: ', crime['ViolentCrimesPerPop'].mean())
print('STD for violent crimes: ', crime['ViolentCrimesPerPop'].std())
print('Median for violent crimes: ', crime['ViolentCrimesPerPop'].median())
Minimum for violent crimes:  0.0
Maximum for violent crimes:  4877.06
Mean for violent crimes:  589.0789217652957
STD for violent crimes:  614.7845182453359
Median for violent crimes:  374.06

Check out min, mx, mean, median, and std for non-violent crimes

In [7]:
print('Minimum for non-violent crimes: ', crime['nonViolPerPop'].min())
print('Maximum for non-violent crimes: ', crime['nonViolPerPop'].max())
print('Mean for non-violent crimes: ', crime['nonViolPerPop'].mean())
print('STD for non-violent crimes: ', crime['nonViolPerPop'].std())
print('Median for non-violent crimes: ', crime['nonViolPerPop'].median())
Minimum for non-violent crimes:  116.79
Maximum for non-violent crimes:  27119.76
Mean for non-violent crimes:  4908.241803588295
STD for non-violent crimes:  2739.7089005280213
Median for non-violent crimes:  4425.450000000001

Violent crime density

In [8]:
plt.figure(figsize=(20,15))

plt.subplot(3,2,1)
plt.title('Violent Crimes density')
crime['ViolentCrimesPerPop'].hist()
plt.xlabel('Number of Violent Crimes per 100k population')
plt.ylabel('count')

plt.subplot(3,2,2)
plt.title('Murder Crimes density')
crime['murdPerPop'].hist()
plt.xlabel('Number of Murders per 100k population')
plt.ylabel('count')

plt.subplot(3,2,3)
plt.title('Rape Crimes density')
crime['rapesPerPop'].hist()
plt.xlabel('Number of Rapes per 100k population')
plt.ylabel('count')

plt.subplot(3,2,4)

plt.title('Robbery Crimes density')
crime['robbbPerPop'].hist()
plt.xlabel('Number of Robberies per 100K population')
plt.ylabel('count')

plt.subplot(3,2,5)
plt.title('Assault Crimes density')
crime['assaultPerPop'].hist()
plt.xlabel('Number of Assaults per 100k population')
plt.ylabel('count')
Out[8]:
Text(0, 0.5, 'count')

Non-violent crime density

In [9]:
plt.figure(figsize=(20,15))

plt.subplot(3,2,1)
plt.title('Non-Violent Crimes density')
crime['nonViolPerPop'].hist()
plt.xlabel('Number of Non-Violent Crimes per 100k population')
plt.ylabel('count')

plt.subplot(3,2,2)
plt.title('Arson Crimes density')
crime['arsonsPerPop'].hist()
plt.xlabel('Arsons density')
plt.ylabel('count')

plt.subplot(3,2,3)
plt.title('Burglary Crimes density')
crime['burglPerPop'].hist()
plt.xlabel('Burglaries density')
plt.ylabel('count')

plt.subplot(3,2,4)
plt.title('Larceny Crimes density')
crime['larcPerPop'].hist()
plt.xlabel('Larcenies density')
plt.ylabel('Counts')

plt.subplot(3,2,5)
plt.title('Auto-theft Crimes density')
crime['autoTheftPerPop'].hist()
plt.xlabel('Auto thefts density')
plt.ylabel('Counts')
Out[9]:
Text(0, 0.5, 'Counts')

This is a smart way of visualizing density for each type of crime and see which one is most widespread

Check per capita income for each race

In [10]:
plt.figure(figsize=(20,15))

plt.subplot(3,2,1)
plt.title('whitePerCap')
crime['whitePerCap'].hist()
plt.xlabel('Per capita income for caucasians')
plt.ylabel('count')

plt.subplot(3,2,2)
plt.title('blackPerCap')
crime['blackPerCap'].hist()
plt.xlabel('Per capita income for blacks')
plt.ylabel('count')

plt.subplot(3,2,3)
plt.title('indianPerCap')
crime['indianPerCap'].hist()
plt.xlabel('Per capita income for indians')
plt.ylabel('count')

plt.subplot(3,2,4)
plt.title('AsianPerCap')
crime['AsianPerCap'].hist()
plt.xlabel('Per capita income for asians')
plt.ylabel('count')

plt.subplot(3,2,5)
plt.title('HispPerCap')
crime['HispPerCap'].hist()
plt.xlabel('Per capita income for hispanics')
plt.ylabel('count')
Out[10]:
Text(0, 0.5, 'count')

Exploring data through linear regression plots

Plot linear regression plots for %age and non-violent crimes

In [11]:
sns.lmplot(x='agePct12t21', y='nonViolPerPop', data=crime)
sns.lmplot(x='agePct12t29', y='nonViolPerPop', data=crime)
sns.lmplot(x='agePct16t24', y='nonViolPerPop', data=crime)
sns.lmplot(x='agePct65up', y='nonViolPerPop', data=crime)
Out[11]:
<seaborn.axisgrid.FacetGrid at 0x216a388bf10>
In [14]:
# calculate correlation coeffcient between %age and non-violent crimes
crime1 = crime.dropna(subset=['agePct12t21', 'agePct12t29', 'agePct16t24', 'agePct65up', \
                              'agePop12t21', 'agePop12t29', 'agePop16t24', 'agePop65up',
                              'nonViolPerPop', 'ViolentCrimesPerPop'], how='any')
print('Correlation coefficient for ages 12-21: ', scipy.stats.pearsonr(crime1['agePct12t21'], crime1['nonViolPerPop'])[0])
print('Correlation coefficient for ages 12-29: ', scipy.stats.pearsonr(crime1['agePct12t29'], crime1['nonViolPerPop'])[0])
print('Correlation coefficient for ages 16-24: ', scipy.stats.pearsonr(crime1['agePct16t24'], crime1['nonViolPerPop'])[0])
print('Correlation coefficient for ages 65+: ', scipy.stats.pearsonr(crime1['agePct65up'], crime1['nonViolPerPop'])[0])
Correlation coefficient for ages 12-21:  0.023780025891635356
Correlation coefficient for ages 12-29:  0.11128019835681058
Correlation coefficient for ages 16-24:  0.06647804421706746
Correlation coefficient for ages 65+:  0.126582344754042

Plot linear regression plots for population age and non-violent crimes

In [15]:
crime['agePop12t21'] = crime['agePct12t21'] * crime['population']
crime['agePop12t29'] = crime['agePct12t29'] * crime['population']
crime['agePop16t24'] = crime['agePct16t24'] * crime['population']
crime['agePop65up'] = crime['agePct65up'] * crime['population']
In [16]:
sns.lmplot(x='agePop12t21', y='nonViolPerPop', data=crime)
sns.lmplot(x='agePop12t29', y='nonViolPerPop', data=crime)
sns.lmplot(x='agePop16t24', y='nonViolPerPop', data=crime)
sns.lmplot(x='agePop65up', y='nonViolPerPop', data=crime)
Out[16]:
<seaborn.axisgrid.FacetGrid at 0x216a344cbb0>
In [17]:
# calculate correlation coefficient between population age and non-violent crimes
print('Correlation coefficient for ages 12-21: ', scipy.stats.pearsonr(crime1['agePop12t21'], crime1['nonViolPerPop'])[0])
print('Correlation coefficient for ages 12-29: ', scipy.stats.pearsonr(crime1['agePop12t29'], crime1['nonViolPerPop'])[0])
print('Correlation coefficient for ages 16-24: ', scipy.stats.pearsonr(crime1['agePop16t24'], crime1['nonViolPerPop'])[0])
print('Correlation coefficient for ages 65+: ', scipy.stats.pearsonr(crime1['agePop65up'], crime1['nonViolPerPop'])[0])
Correlation coefficient for ages 12-21:  0.1268336759071523
Correlation coefficient for ages 12-29:  0.1252028812333492
Correlation coefficient for ages 16-24:  0.13008854745491247
Correlation coefficient for ages 65+:  0.12455724077253921

Plot linear regression for % age and violent crimes

In [18]:
sns.lmplot(x='agePct12t21', y='ViolentCrimesPerPop', data=crime)
sns.lmplot(x='agePct12t29', y='ViolentCrimesPerPop', data=crime)
sns.lmplot(x='agePct16t24', y='ViolentCrimesPerPop', data=crime)
sns.lmplot(x='agePct65up', y='ViolentCrimesPerPop', data=crime)
Out[18]:
<seaborn.axisgrid.FacetGrid at 0x216a446c160>
In [19]:
# calculate correlation coeffcient between %age and violent crimes
print('Correlation coefficient for ages 12-21: ', scipy.stats.pearsonr(crime1['agePct12t21'], crime1['ViolentCrimesPerPop'])[0])
print('Correlation coefficient for ages 12-29: ', scipy.stats.pearsonr(crime1['agePct12t29'], crime1['ViolentCrimesPerPop'])[0])
print('Correlation coefficient for ages 16-24: ', scipy.stats.pearsonr(crime1['agePct16t24'], crime1['ViolentCrimesPerPop'])[0])
print('Correlation coefficient for ages 65+: ', scipy.stats.pearsonr(crime1['agePct65up'], crime1['ViolentCrimesPerPop'])[0])
Correlation coefficient for ages 12-21:  0.02202395356565119
Correlation coefficient for ages 12-29:  0.1099396079134631
Correlation coefficient for ages 16-24:  0.04841759061410261
Correlation coefficient for ages 65+:  0.05396574839211378

Plot linear regression plots for population age and violent crimes

In [20]:
sns.lmplot(x='agePop12t21', y='ViolentCrimesPerPop', data=crime)
sns.lmplot(x='agePop12t29', y='ViolentCrimesPerPop', data=crime)
sns.lmplot(x='agePop16t24', y='ViolentCrimesPerPop', data=crime)
sns.lmplot(x='agePop65up', y='ViolentCrimesPerPop', data=crime)
Out[20]:
<seaborn.axisgrid.FacetGrid at 0x216a3bc8b50>
In [21]:
# calculate correlation coeffcient between %age and violent crimes
print('Correlation coefficient for ages 12-21: ', scipy.stats.pearsonr(crime1['agePop12t21'], crime1['ViolentCrimesPerPop'])[0])
print('Correlation coefficient for ages 12-29: ', scipy.stats.pearsonr(crime1['agePop12t29'], crime1['ViolentCrimesPerPop'])[0])
print('Correlation coefficient for ages 16-24: ', scipy.stats.pearsonr(crime1['agePop16t24'], crime1['ViolentCrimesPerPop'])[0])
print('Correlation coefficient for ages 65+: ', scipy.stats.pearsonr(crime1['agePop65up'], crime1['ViolentCrimesPerPop'])[0])
Correlation coefficient for ages 12-21:  0.22012359052809355
Correlation coefficient for ages 12-29:  0.2188657592482586
Correlation coefficient for ages 16-24:  0.22264431466013046
Correlation coefficient for ages 65+:  0.2117114786134144

Plot linear regression plots for %education and non-violent crimes

In [22]:
sns.lmplot(x='PctLess9thGrade', y='nonViolPerPop', data=crime)
sns.lmplot(x='PctNotHSGrad', y='nonViolPerPop', data=crime)
sns.lmplot(x='PctBSorMore', y='nonViolPerPop', data=crime)
Out[22]:
<seaborn.axisgrid.FacetGrid at 0x216a3aedb50>
In [25]:
# calculate correlation coeffcient between %education and non-violent crimes
crime1 = crime.dropna(subset=['agePct12t21', 'agePct12t29', 'agePct16t24', 'agePct65up', 
                              'agePop12t21', 'agePop12t29', 'agePop16t24', 'agePop65up',
                              'PctLess9thGrade', 'PctNotHSGrad', 'PctBSorMore',
                              'PopLess9thGrade', 'PopNotHSGrad', 'PopBSorMore',
                              'nonViolPerPop', 'ViolentCrimesPerPop'], how='any')
print('Correlation coefficient for PctLess9thGrade: ', scipy.stats.pearsonr(crime1['PctLess9thGrade'], crime1['nonViolPerPop'])[0])
print('Correlation coefficient for PctNotHSGrad: ', scipy.stats.pearsonr(crime1['PctNotHSGrad'], crime1['nonViolPerPop'])[0])
print('Correlation coefficient for PctBSorMore: ', scipy.stats.pearsonr(crime1['PctBSorMore'], crime1['nonViolPerPop'])[0])
Correlation coefficient for PctLess9thGrade:  0.28784927687473016
Correlation coefficient for PctNotHSGrad:  0.3665001575364964
Correlation coefficient for PctBSorMore:  -0.27101682578840336

Plot linear regression plots for population education and non-violent crimes

In [26]:
crime['PopLess9thGrade'] = crime['PctLess9thGrade'] * crime['population']
crime['PopNotHSGrad'] = crime['PctNotHSGrad'] * crime['population']
crime['PopBSorMore'] = crime['PctBSorMore'] * crime['population']
In [27]:
sns.lmplot(x='PopLess9thGrade', y='nonViolPerPop', data=crime)
sns.lmplot(x='PopNotHSGrad', y='nonViolPerPop', data=crime)
sns.lmplot(x='PopBSorMore', y='nonViolPerPop', data=crime)
Out[27]:
<seaborn.axisgrid.FacetGrid at 0x216a34cf400>
In [28]:
# calculate correlation coeffcient between population education and non-violent crimes
print('Correlation coefficient for PopLess9thGrade: ', scipy.stats.pearsonr(crime1['PopLess9thGrade'], crime1['nonViolPerPop'])[0])
print('Correlation coefficient for PopNotHSGrad: ', scipy.stats.pearsonr(crime1['PopNotHSGrad'], crime1['nonViolPerPop'])[0])
print('Correlation coefficient for PopBSorMore: ', scipy.stats.pearsonr(crime1['PopBSorMore'], crime1['nonViolPerPop'])[0])
Correlation coefficient for PopLess9thGrade:  0.11147481298678563
Correlation coefficient for PopNotHSGrad:  0.11912455062613182
Correlation coefficient for PopBSorMore:  0.09851144679694106

Plot linear regression plots for %education and non-violent crimes

In [29]:
sns.lmplot(x='PctLess9thGrade', y='ViolentCrimesPerPop', data=crime)
sns.lmplot(x='PctNotHSGrad', y='ViolentCrimesPerPop', data=crime)
sns.lmplot(x='PctBSorMore', y='ViolentCrimesPerPop', data=crime)
Out[29]:
<seaborn.axisgrid.FacetGrid at 0x216a5423c70>
In [30]:
# calculate correlation coeffcient between %education and violent crimes
print('Correlation coefficient for PctLess9thGrade: ', scipy.stats.pearsonr(crime1['PctLess9thGrade'], crime1['ViolentCrimesPerPop'])[0])
print('Correlation coefficient for PctNotHSGrad: ', scipy.stats.pearsonr(crime1['PctNotHSGrad'], crime1['ViolentCrimesPerPop'])[0])
print('Correlation coefficient for PctBSorMore: ', scipy.stats.pearsonr(crime1['PctBSorMore'], crime1['ViolentCrimesPerPop'])[0])
Correlation coefficient for PctLess9thGrade:  0.37080716309505024
Correlation coefficient for PctNotHSGrad:  0.46651461611308775
Correlation coefficient for PctBSorMore:  -0.29929005457851565

Plot linear regression plots for population education and violent crimes

In [31]:
sns.lmplot(x='PopLess9thGrade', y='ViolentCrimesPerPop', data=crime)
sns.lmplot(x='PopNotHSGrad', y='ViolentCrimesPerPop', data=crime)
sns.lmplot(x='PopBSorMore', y='ViolentCrimesPerPop', data=crime)
Out[31]:
<seaborn.axisgrid.FacetGrid at 0x216a5459f70>
In [32]:
# calculate correlation coeffcient between population education and violent crimes
print('Correlation coefficient for PopLess9thGrade: ', scipy.stats.pearsonr(crime1['PopLess9thGrade'], crime1['ViolentCrimesPerPop'])[0])
print('Correlation coefficient for PopNotHSGrad: ', scipy.stats.pearsonr(crime1['PopNotHSGrad'], crime1['ViolentCrimesPerPop'])[0])
print('Correlation coefficient for PopBSorMore: ', scipy.stats.pearsonr(crime1['PopBSorMore'], crime1['ViolentCrimesPerPop'])[0])
Correlation coefficient for PopLess9thGrade:  0.21514133095190566
Correlation coefficient for PopNotHSGrad:  0.22474245308421073
Correlation coefficient for PopBSorMore:  0.1788314876881413

Plot linear regression plots for %employment/unemployment and non-violent crimes

In [33]:
sns.lmplot(x='PctEmploy', y='nonViolPerPop', data=crime)
sns.lmplot(x='PctUnemployed', y='nonViolPerPop', data=crime)
Out[33]:
<seaborn.axisgrid.FacetGrid at 0x216a5d7b130>
In [36]:
# calculate correlation coefficient between %employed/unemployed and non-violent crimes
crime1 = crime.dropna(subset=['agePct12t21', 'agePct12t29', 'agePct16t24', 'agePct65up', 
                              'agePop12t21', 'agePop12t29', 'agePop16t24', 'agePop65up',
                              'PctLess9thGrade', 'PctNotHSGrad', 'PctBSorMore',
                              'PopLess9thGrade', 'PopNotHSGrad', 'PopBSorMore',
                              'PctUnemployed', 'PctEmploy',
                              'PopUnemployed', 'PopEmploy',
                              'nonViolPerPop', 'ViolentCrimesPerPop'], how='any')
print('Correlation coefficient for PctUnemployed: ', scipy.stats.pearsonr(crime1['PctUnemployed'], crime1['nonViolPerPop'])[0])
print('Correlation coefficient for PctEmploy: ', scipy.stats.pearsonr(crime1['PctEmploy'], crime1['nonViolPerPop'])[0])
Correlation coefficient for PctUnemployed:  0.39208500191553797
Correlation coefficient for PctEmploy:  -0.3047104919359426

Plot linear regression plots for population employment/unemploymet and non-violent crimes

In [37]:
crime['PopEmploy'] = crime['PctEmploy'] * crime['population']
crime['PopUnemployed'] = crime['PctUnemployed'] * crime['population']
In [38]:
sns.lmplot(x='PopEmploy', y='nonViolPerPop', data=crime)
sns.lmplot(x='PopUnemployed', y='nonViolPerPop', data=crime)
Out[38]:
<seaborn.axisgrid.FacetGrid at 0x216a5dbaac0>
In [39]:
# Calculate correlation coefficient for population employed/unemployed and non-violent crimes
print('Correlation coefficient for PopUnemployed: ', scipy.stats.pearsonr(crime1['PopUnemployed'], crime1['nonViolPerPop'])[0])
print('Correlation coefficient for PopEmploy: ', scipy.stats.pearsonr(crime1['PopEmploy'], crime1['nonViolPerPop'])[0])
Correlation coefficient for PopUnemployed:  0.1217417798243135
Correlation coefficient for PopEmploy:  0.11749062427354053

Plot linear regression plots for %employment/unemployment and violent crimes

In [40]:
sns.lmplot(x='PctEmploy', y='ViolentCrimesPerPop', data=crime)
sns.lmplot(x='PctUnemployed', y='ViolentCrimesPerPop', data=crime)
Out[40]:
<seaborn.axisgrid.FacetGrid at 0x216a5d73a60>
In [41]:
# Calculate correlation coefficient for %employed/unemployed and violent crimes
print('Correlation coefficient for PctUnemployed: ', scipy.stats.pearsonr(crime1['PctUnemployed'], crime1['ViolentCrimesPerPop'])[0])
print('Correlation coefficient for PctEmploy: ', scipy.stats.pearsonr(crime1['PctEmploy'], crime1['ViolentCrimesPerPop'])[0])
Correlation coefficient for PctUnemployed:  0.4749680398078536
Correlation coefficient for PctEmploy:  -0.31226118672258446

Plot linear regression plots for population employment/unemploymet and violent crimes

In [42]:
sns.lmplot(x='PopEmploy', y='ViolentCrimesPerPop', data=crime)
sns.lmplot(x='PopUnemployed', y='ViolentCrimesPerPop', data=crime)
Out[42]:
<seaborn.axisgrid.FacetGrid at 0x216a5423460>
In [43]:
# Calculate correlation coefficient for population employed/unemployed and violent crimes
print('Correlation coefficient for PopUnemployed: ', scipy.stats.pearsonr(crime1['PopUnemployed'], crime1['ViolentCrimesPerPop'])[0])
print('Correlation coefficient for PopEmploy: ', scipy.stats.pearsonr(crime1['PopEmploy'], crime1['ViolentCrimesPerPop'])[0])
Correlation coefficient for PopUnemployed:  0.22277532593518973
Correlation coefficient for PopEmploy:  0.20954788198015384

Plot linear regression plots for %vacancy and non-violent crimes

In [44]:
sns.lmplot(x='PctHousOccup', y='nonViolPerPop', data=crime)
sns.lmplot(x='PctHousOwnOcc', y='nonViolPerPop', data=crime)
sns.lmplot(x='PctVacantBoarded', y='nonViolPerPop', data=crime)
sns.lmplot(x='PctVacMore6Mos', y='nonViolPerPop', data=crime)
Out[44]:
<seaborn.axisgrid.FacetGrid at 0x216a3f3aa60>
In [47]:
# calculate correlation coefficient between %vacancy and non-violent crimes
crime1 = crime.dropna(subset=['agePct12t21', 'agePct12t29', 'agePct16t24', 'agePct65up', 
                              'agePop12t21', 'agePop12t29', 'agePop16t24', 'agePop65up',
                              'PctLess9thGrade', 'PctNotHSGrad', 'PctBSorMore',
                              'PopLess9thGrade', 'PopNotHSGrad', 'PopBSorMore',
                              'PctUnemployed', 'PctEmploy',
                              'PopUnemployed', 'PopEmploy',
                              'PctHousOccup', 'PctHousOwnOcc', 'PctVacantBoarded', 'PctVacMore6Mos',
                              'PopHousOccup', 'PopHousOwnOcc', 'PopVacantBoarded', 'PopVacMore6Mos',
                              'nonViolPerPop', 'ViolentCrimesPerPop'], how='any')
print('Correlation coefficient for PctHousOccup: ', scipy.stats.pearsonr(crime1['PctHousOccup'], crime1['nonViolPerPop'])[0])
print('Correlation coefficient for PctHousOwnOcc: ', scipy.stats.pearsonr(crime1['PctHousOwnOcc'], crime1['nonViolPerPop'])[0])
print('Correlation coefficient for PctVacantBoarded: ', scipy.stats.pearsonr(crime1['PctVacantBoarded'], crime1['nonViolPerPop'])[0])
print('Correlation coefficient for PctVacMore6Mos: ', scipy.stats.pearsonr(crime1['PctVacMore6Mos'], crime1['nonViolPerPop'])[0])
Correlation coefficient for PctHousOccup:  -0.3039032395515145
Correlation coefficient for PctHousOwnOcc:  -0.4622358628933084
Correlation coefficient for PctVacantBoarded:  0.32367867144782136
Correlation coefficient for PctVacMore6Mos:  -0.04302596621892053

Plot linear regression plots for population vacancy and non-violent crimes

In [48]:
crime['PopHousOccup'] = crime['PctHousOccup'] * crime['population']
crime['PopHousOwnOcc'] = crime['PctHousOwnOcc'] * crime['population']
crime['PopVacantBoarded'] = crime['PctVacantBoarded'] * crime['population']
crime['PopVacMore6Mos'] = crime['PctVacMore6Mos'] * crime['population']
In [49]:
sns.lmplot(x='PopHousOccup', y='nonViolPerPop', data=crime)
sns.lmplot(x='PopHousOwnOcc', y='nonViolPerPop', data=crime)
sns.lmplot(x='PopVacantBoarded', y='nonViolPerPop', data=crime)
sns.lmplot(x='PopVacMore6Mos', y='nonViolPerPop', data=crime)
Out[49]:
<seaborn.axisgrid.FacetGrid at 0x216a4069af0>
In [50]:
# calculate correlation coefficient for population vacancy and non-violent crimes
print('Correlation coefficient for PopHousOccup: ', scipy.stats.pearsonr(crime1['PopHousOccup'], crime1['nonViolPerPop'])[0])
print('Correlation coefficient for PopHousOwnOcc: ', scipy.stats.pearsonr(crime1['PopHousOwnOcc'], crime1['nonViolPerPop'])[0])
print('Correlation coefficient for PopVacantBoarded: ', scipy.stats.pearsonr(crime1['PopVacantBoarded'], crime1['nonViolPerPop'])[0])
print('Correlation coefficient for PopVacMore6Mos: ', scipy.stats.pearsonr(crime1['PopVacMore6Mos'], crime1['nonViolPerPop'])[0])
Correlation coefficient for PopHousOccup:  0.1133129368532727
Correlation coefficient for PopHousOwnOcc:  0.13714579037718935
Correlation coefficient for PopVacantBoarded:  0.1531535526401337
Correlation coefficient for PopVacMore6Mos:  0.10825444676130508

Plot linear regression plots for %vacancy and non-violent crimes

In [51]:
sns.lmplot(x='PctHousOccup', y='ViolentCrimesPerPop', data=crime)
sns.lmplot(x='PctHousOwnOcc', y='ViolentCrimesPerPop', data=crime)
sns.lmplot(x='PctVacantBoarded', y='ViolentCrimesPerPop', data=crime)
sns.lmplot(x='PctVacMore6Mos', y='ViolentCrimesPerPop', data=crime)
Out[51]:
<seaborn.axisgrid.FacetGrid at 0x216a3c92340>
In [52]:
# calculate correlation coefficient between %vacancy and violent crimes
print('Correlation coefficient for PctHousOccup: ', scipy.stats.pearsonr(crime1['PctHousOccup'], crime1['ViolentCrimesPerPop'])[0])
print('Correlation coefficient for PctHousOwnOcc: ', scipy.stats.pearsonr(crime1['PctHousOwnOcc'], crime1['ViolentCrimesPerPop'])[0])
print('Correlation coefficient for PctVacantBoarded: ', scipy.stats.pearsonr(crime1['PctVacantBoarded'], crime1['ViolentCrimesPerPop'])[0])
print('Correlation coefficient for PctVacMore6Mos: ', scipy.stats.pearsonr(crime1['PctVacMore6Mos'], crime1['ViolentCrimesPerPop'])[0])
Correlation coefficient for PctHousOccup:  -0.25554595819128345
Correlation coefficient for PctHousOwnOcc:  -0.46069357769159813
Correlation coefficient for PctVacantBoarded:  0.47510410552705856
Correlation coefficient for PctVacMore6Mos:  0.017526764073398652

Plot linear regression plots for population vacancy and non-violent crimes

In [53]:
sns.lmplot(x='PopHousOccup', y='ViolentCrimesPerPop', data=crime)
sns.lmplot(x='PopHousOwnOcc', y='ViolentCrimesPerPop', data=crime)
sns.lmplot(x='PopVacantBoarded', y='ViolentCrimesPerPop', data=crime)
sns.lmplot(x='PopVacMore6Mos', y='ViolentCrimesPerPop', data=crime)
Out[53]:
<seaborn.axisgrid.FacetGrid at 0x216a4fa8a00>
In [54]:
# calculate correlatiion coefficient between population vacancy and violent crimes
print('Correlation coefficient for PopHousOccup: ', scipy.stats.pearsonr(crime1['PopHousOccup'], crime1['ViolentCrimesPerPop'])[0])
print('Correlation coefficient for PopHousOwnOcc: ', scipy.stats.pearsonr(crime1['PopHousOwnOcc'], crime1['ViolentCrimesPerPop'])[0])
print('Correlation coefficient for PopVacantBoarded: ', scipy.stats.pearsonr(crime1['PopVacantBoarded'], crime1['ViolentCrimesPerPop'])[0])
print('Correlation coefficient for PopVacMore6Mos: ', scipy.stats.pearsonr(crime1['PopVacMore6Mos'], crime1['ViolentCrimesPerPop'])[0])
Correlation coefficient for PopHousOccup:  0.2062343596261637
Correlation coefficient for PopHousOwnOcc:  0.2318456195991899
Correlation coefficient for PopVacantBoarded:  0.2698337506115343
Correlation coefficient for PopVacMore6Mos:  0.1995592330118498

Plot linear regression plot for % race and non-violent crimes

In [59]:
sns.lmplot(x='racepctblack', y='nonViolPerPop', data=crime)
sns.lmplot(x='racePctWhite', y='nonViolPerPop', data=crime)
sns.lmplot(x='racePctAsian', y='nonViolPerPop', data=crime)
sns.lmplot(x='racePctHisp', y='nonViolPerPop', data=crime)
Out[59]:
<seaborn.axisgrid.FacetGrid at 0x216a4fe24c0>
In [60]:
# calculate correlation coefficient between %race and non-violent crimes
crime1 = crime.dropna(subset=['agePct12t21', 'agePct12t29', 'agePct16t24', 'agePct65up', 
                              'agePop12t21', 'agePop12t29', 'agePop16t24', 'agePop65up',
                              'PctLess9thGrade', 'PctNotHSGrad', 'PctBSorMore',
                              'PopLess9thGrade', 'PopNotHSGrad', 'PopBSorMore',
                              'PctUnemployed', 'PctEmploy',
                              'PopUnemployed', 'PopEmploy',
                              'PctHousOccup', 'PctHousOwnOcc', 'PctVacantBoarded', 'PctVacMore6Mos',
                              'PopHousOccup', 'PopHousOwnOcc', 'PopVacantBoarded', 'PopVacMore6Mos',
                              'racepctblack', 'racePctWhite', 'racePctAsian', 'racePctHisp',
                              'racepopblack', 'racePopWhite', 'racePopAsian', 'racePopHisp',
                              'nonViolPerPop', 'ViolentCrimesPerPop'], how='any')
print('Correlation coefficient for pctraceblack: ', scipy.stats.pearsonr(crime1['racepctblack'], crime1['nonViolPerPop'])[0])
print('Correlation coefficient for pctRaceWhite: ', scipy.stats.pearsonr(crime1['racePctWhite'], crime1['nonViolPerPop'])[0])
print('Correlation coefficient for pctRaceAsian: ', scipy.stats.pearsonr(crime1['racePctAsian'], crime1['nonViolPerPop'])[0])
print('Correlation coefficient for pctRaceHisp: ', scipy.stats.pearsonr(crime1['racePctHisp'], crime1['nonViolPerPop'])[0])
Correlation coefficient for pctraceblack:  0.4743247060336021
Correlation coefficient for pctRaceWhite:  -0.4765791610681369
Correlation coefficient for pctRaceAsian:  -0.03474179713723831
Correlation coefficient for pctRaceHisp:  0.17462237036514378

Plot linear regression plots for population race and non-violent crimes

In [61]:
crime['racepopblack'] = crime['racepctblack'] * crime['population']
crime['racePopWhite'] = crime['racePctWhite'] * crime['population']
crime['racePopAsian'] = crime['racePctAsian'] * crime['population']
crime['racePopHisp'] = crime['racePctHisp'] * crime['population']
In [62]:
sns.lmplot(x='racepopblack', y='nonViolPerPop', data=crime)
sns.lmplot(x='racePopWhite', y='nonViolPerPop', data=crime)
sns.lmplot(x='racePopAsian', y='nonViolPerPop', data=crime)
sns.lmplot(x='racePopHisp', y='nonViolPerPop', data=crime)
Out[62]:
<seaborn.axisgrid.FacetGrid at 0x216a4165220>
In [63]:
# calculate correlation coefficient for population race and non-violent crimes
print('Correlation coefficient for popraceblack: ', scipy.stats.pearsonr(crime1['racepopblack'], crime1['nonViolPerPop'])[0])
print('Correlation coefficient for popRaceWhite: ', scipy.stats.pearsonr(crime1['racePopWhite'], crime1['nonViolPerPop'])[0])
print('Correlation coefficient for popRaceAsian: ', scipy.stats.pearsonr(crime1['racePopAsian'], crime1['nonViolPerPop'])[0])
print('Correlation coefficient for popRaceHisp: ', scipy.stats.pearsonr(crime1['racePopHisp'], crime1['nonViolPerPop'])[0])
Correlation coefficient for popraceblack:  0.1381219744795777
Correlation coefficient for popRaceWhite:  0.12002428600448703
Correlation coefficient for popRaceAsian:  0.054190814071591376
Correlation coefficient for popRaceHisp:  0.08281266109978011

Plot linear regression plot for %race and violent crimes

In [64]:
sns.lmplot(x='racepctblack', y='ViolentCrimesPerPop', data=crime)
sns.lmplot(x='racePctWhite', y='ViolentCrimesPerPop', data=crime)
sns.lmplot(x='racePctAsian', y='ViolentCrimesPerPop', data=crime)
sns.lmplot(x='racePctHisp', y='ViolentCrimesPerPop', data=crime)
Out[64]:
<seaborn.axisgrid.FacetGrid at 0x216a34421f0>
In [65]:
# calculate correlation coefficient for %race and violent crimes
print('Correlation coefficient for pctraceblack: ', scipy.stats.pearsonr(crime1['racepctblack'], crime1['ViolentCrimesPerPop'])[0])
print('Correlation coefficient for pctRaceWhite: ', scipy.stats.pearsonr(crime1['racePctWhite'], crime1['ViolentCrimesPerPop'])[0])
print('Correlation coefficient for pctRaceAsian: ', scipy.stats.pearsonr(crime1['racePctAsian'], crime1['ViolentCrimesPerPop'])[0])
print('Correlation coefficient for pctRaceHisp: ', scipy.stats.pearsonr(crime1['racePctHisp'], crime1['ViolentCrimesPerPop'])[0])
Correlation coefficient for pctraceblack:  0.6238334896507505
Correlation coefficient for pctRaceWhite:  -0.676357463352348
Correlation coefficient for pctRaceAsian:  0.03604447688047006
Correlation coefficient for pctRaceHisp:  0.26451715732322045

Plot linear regression plot for population race and violent crimes

In [66]:
sns.lmplot(x='racepopblack', y='ViolentCrimesPerPop', data=crime)
sns.lmplot(x='racePopWhite', y='ViolentCrimesPerPop', data=crime)
sns.lmplot(x='racePopAsian', y='ViolentCrimesPerPop', data=crime)
sns.lmplot(x='racePopHisp', y='ViolentCrimesPerPop', data=crime)
Out[66]:
<seaborn.axisgrid.FacetGrid at 0x216a5e1bfd0>
In [67]:
# calculate correlation coefficient for population race and violent crimes
print('Correlation coefficient for popraceblack: ', scipy.stats.pearsonr(crime1['racepopblack'], crime1['ViolentCrimesPerPop'])[0])
print('Correlation coefficient for popRaceWhite: ', scipy.stats.pearsonr(crime1['racePopWhite'], crime1['ViolentCrimesPerPop'])[0])
print('Correlation coefficient for popRaceAsian: ', scipy.stats.pearsonr(crime1['racePopAsian'], crime1['ViolentCrimesPerPop'])[0])
print('Correlation coefficient for popRaceHisp: ', scipy.stats.pearsonr(crime1['racePopHisp'], crime1['ViolentCrimesPerPop'])[0])
Correlation coefficient for popraceblack:  0.2614411462973624
Correlation coefficient for popRaceWhite:  0.1910352609128278
Correlation coefficient for popRaceAsian:  0.14443796853542853
Correlation coefficient for popRaceHisp:  0.16900747491190227

Plot linear regression plots for violent crimes vs. non-violent crimes by region

In [68]:
sns.lmplot(x='ViolentCrimesPerPop', y='nonViolPerPop', data=crime,
           fit_reg=True, #  regression line
           hue='Region',x_jitter=.1, y_jitter=0.1)   # Color by Region
Out[68]:
<seaborn.axisgrid.FacetGrid at 0x216a4178100>

Correlation matrix and Multiple Linear Regression model

Plot heatmap

In [69]:
crime.corr()
Out[69]:
Unnamed: 0 countyCode communityCode fold population householdsize racepctblack racePctWhite racePctAsian racePctHisp ... PopEmploy PopUnemployed PopHousOccup PopHousOwnOcc PopVacantBoarded PopVacMore6Mos racepopblack racePopWhite racePopAsian racePopHisp
Unnamed: 0 1.000000 0.330027 0.098597 0.026590 0.002961 -0.085890 0.168589 -0.093145 -0.118527 -0.098382 ... 0.003722 0.000673 0.001322 0.015846 0.007857 -0.003181 0.019008 0.006031 -0.033660 -0.031724
countyCode 0.330027 1.000000 0.124997 -0.060262 0.080867 -0.032992 0.219294 -0.173580 -0.085545 -0.088787 ... 0.087084 0.049108 0.082187 0.093037 0.046351 0.062575 0.091435 0.079932 0.028103 -0.013803
communityCode 0.098597 0.124997 1.000000 0.004526 -0.034680 0.002698 -0.013897 0.014749 0.033570 0.000687 ... -0.030735 -0.049584 -0.034231 -0.019389 -0.040014 -0.037967 -0.050314 -0.018727 -0.005206 -0.026689
fold 0.026590 -0.060262 0.004526 1.000000 -0.044338 0.015973 -0.040064 0.022973 0.004439 0.035620 ... -0.043224 -0.045374 -0.044404 -0.039704 -0.048556 -0.048266 -0.047387 -0.039264 -0.042699 -0.036323
population 0.002961 0.080867 -0.034680 -0.044338 1.000000 -0.018841 0.135641 -0.184685 0.088360 0.094048 ... 0.997420 0.980257 0.999602 0.960874 0.817184 0.969573 0.920310 0.985310 0.881461 0.915035
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
PopVacMore6Mos -0.003181 0.062575 -0.037967 -0.048266 0.969573 -0.028066 0.162980 -0.180756 0.037435 0.062449 ... 0.954669 0.974549 0.967702 0.922105 0.878518 1.000000 0.961107 0.941060 0.794884 0.836116
racepopblack 0.019008 0.091435 -0.050314 -0.047387 0.920310 -0.025811 0.261235 -0.252400 0.023510 0.036930 ... 0.899555 0.956437 0.917470 0.873860 0.906417 0.961107 1.000000 0.856934 0.722720 0.754417
racePopWhite 0.006031 0.079932 -0.018727 -0.039264 0.985310 -0.037977 0.086765 -0.137822 0.087790 0.089963 ... 0.989585 0.941981 0.984765 0.963334 0.770024 0.941060 0.856934 1.000000 0.865640 0.901571
racePopAsian -0.033660 0.028103 -0.005206 -0.042699 0.881461 0.022648 0.043137 -0.162957 0.277567 0.108268 ... 0.888019 0.833330 0.888139 0.809605 0.564153 0.794884 0.722720 0.865640 1.000000 0.879739
racePopHisp -0.031724 -0.013803 -0.026689 -0.036323 0.915035 0.062769 0.049429 -0.157381 0.095645 0.216809 ... 0.918027 0.890480 0.917507 0.849623 0.626975 0.836116 0.754417 0.901571 0.879739 1.000000

163 rows × 163 columns

A lot of variables are present. Will tidy this up by subsetting columns into a new dataframe

In [70]:
# will leave out age group from this subset, since they did not have strong correlation
crimedata = crime[['PctLess9thGrade', 'PctNotHSGrad', 'PctBSorMore',
                  'PctUnemployed', 'PctEmploy',
                   'PctHousOccup', 'PctHousOwnOcc', 'PctVacantBoarded', 'PctVacMore6Mos',
                   'racepctblack', 'racePctWhite', 'racePctAsian', 'racePctHisp',
                   'nonViolPerPop', 'ViolentCrimesPerPop']]
In [71]:
crimedata.corr()
Out[71]:
PctLess9thGrade PctNotHSGrad PctBSorMore PctUnemployed PctEmploy PctHousOccup PctHousOwnOcc PctVacantBoarded PctVacMore6Mos racepctblack racePctWhite racePctAsian racePctHisp nonViolPerPop ViolentCrimesPerPop
PctLess9thGrade 1.000000 0.927560 -0.577203 0.657108 -0.531317 -0.144882 -0.358801 0.322437 0.209439 0.244873 -0.458497 -0.109690 0.635955 0.303679 0.371422
PctNotHSGrad 0.927560 1.000000 -0.751544 0.724004 -0.617251 -0.207866 -0.378211 0.416527 0.283349 0.367293 -0.494350 -0.182692 0.493895 0.387613 0.467596
PctBSorMore -0.577203 -0.751544 1.000000 -0.545808 0.393518 0.179709 0.190965 -0.296578 -0.220493 -0.188492 0.217388 0.262881 -0.245779 -0.282542 -0.299898
PctUnemployed 0.657108 0.724004 -0.545808 1.000000 -0.676430 -0.261544 -0.394427 0.549636 0.299310 0.441598 -0.539588 -0.133305 0.416788 0.408442 0.483441
PctEmploy -0.531317 -0.617251 0.393518 -0.676430 1.000000 0.341939 0.236908 -0.342344 -0.372459 -0.298779 0.282903 0.195498 -0.161893 -0.329996 -0.317644
PctHousOccup -0.144882 -0.207866 0.179709 -0.261544 0.341939 1.000000 0.171256 -0.182542 -0.274189 -0.204498 0.153423 0.177288 -0.073658 -0.309280 -0.256836
PctHousOwnOcc -0.358801 -0.378211 0.190965 -0.394427 0.236908 0.171256 1.000000 -0.221876 0.138628 -0.345849 0.449833 -0.078754 -0.251056 -0.466257 -0.455359
PctVacantBoarded 0.322437 0.416527 -0.296578 0.549636 -0.342344 -0.182542 -0.221876 1.000000 0.366664 0.521610 -0.487907 -0.113272 0.151015 0.343413 0.479910
PctVacMore6Mos 0.209439 0.283349 -0.220493 0.299310 -0.372459 -0.274189 0.138628 0.366664 1.000000 0.190708 -0.033292 -0.323736 -0.122969 -0.017010 0.030769
racepctblack 0.244873 0.367293 -0.188492 0.441598 -0.298779 -0.204498 -0.345849 0.521610 0.190708 1.000000 -0.820605 -0.089300 -0.063911 0.484853 0.628368
racePctWhite -0.458497 -0.494350 0.217388 -0.539588 0.282903 0.153423 0.449833 -0.487907 -0.033292 -0.820605 1.000000 -0.276474 -0.408489 -0.487033 -0.676849
racePctAsian -0.109690 -0.182692 0.262881 -0.133305 0.195498 0.177288 -0.078754 -0.113272 -0.323736 -0.089300 -0.276474 1.000000 0.198439 -0.037223 0.031949
racePctHisp 0.635955 0.493895 -0.245779 0.416788 -0.161893 -0.073658 -0.251056 0.151015 -0.122969 -0.063911 -0.408489 0.198439 1.000000 0.174438 0.253596
nonViolPerPop 0.303679 0.387613 -0.282542 0.408442 -0.329996 -0.309280 -0.466257 0.343413 -0.017010 0.484853 -0.487033 -0.037223 0.174438 1.000000 0.675374
ViolentCrimesPerPop 0.371422 0.467596 -0.299898 0.483441 -0.317644 -0.256836 -0.455359 0.479910 0.030769 0.628368 -0.676849 0.031949 0.253596 0.675374 1.000000

Looks much better

In [72]:
# load dataframe into a variable
crimedata_corr = crimedata.corr()
In [73]:
# make the correlation matrix plot
plt.figure(figsize=(12,10))
sns.heatmap(crimedata_corr,annot=True,vmin=-1.0)
Out[73]:
<matplotlib.axes._subplots.AxesSubplot at 0x216a5eb1850>

This is so much concise and takes less time to generate. Correlation coefficients can be easily viewed for any pair of variables

In [74]:
sns.pairplot(crimedata)
Out[74]:
<seaborn.axisgrid.PairGrid at 0x216a5f0c940>

Multiple Linear Regression

Dependent Variable: ViolentCrimesPerPop

In [80]:
from sklearn.model_selection import train_test_split
from sklearn import linear_model, preprocessing

# Subset data
X = crime1[['PctLess9thGrade', 'PctNotHSGrad', 'PctBSorMore',
                  'PctUnemployed', 'PctEmploy',
                   'PctHousOccup', 'PctHousOwnOcc', 'PctVacantBoarded', 'PctVacMore6Mos',
                   'racepctblack', 'racePctWhite', 'racePctAsian', 'racePctHisp',
                   'nonViolPerPop']]
y = crime1['ViolentCrimesPerPop']

# train data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 123)
In [81]:
# check size
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
(1426, 14) (1426,)
(476, 14) (476,)
In [82]:
# fit a model
lm = linear_model.LinearRegression(normalize=True)

# train model on test data
model = lm.fit(X_train, y_train)

# Evaluate model
print(model.score(X_train, y_train))

# Use model to make predictions
predictions = lm.predict(X_test)

# Calculate coeffcient and intercept
coefficients = model.coef_
intercepts = model.intercept_
0.6672082286337916
In [83]:
# Use statsmodel
X = sm.add_constant(X)
model2 = sm.OLS(y_train,X_train).fit()
print (model2.summary())
                                 OLS Regression Results                                 
========================================================================================
Dep. Variable:     ViolentCrimesPerPop   R-squared (uncentered):                   0.825
Model:                             OLS   Adj. R-squared (uncentered):              0.823
Method:                  Least Squares   F-statistic:                              475.1
Date:                 Fri, 09 Oct 2020   Prob (F-statistic):                        0.00
Time:                         17:11:44   Log-Likelihood:                         -10361.
No. Observations:                 1426   AIC:                                  2.075e+04
Df Residuals:                     1412   BIC:                                  2.082e+04
Df Model:                           14                                                  
Covariance Type:             nonrobust                                                  
====================================================================================
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
PctLess9thGrade    -44.6364      5.250     -8.502      0.000     -54.936     -34.337
PctNotHSGrad        36.2436      3.897      9.299      0.000      28.598      43.889
PctBSorMore          6.3628      1.416      4.493      0.000       3.585       9.141
PctUnemployed        2.6276      6.289      0.418      0.676      -9.710      14.965
PctEmploy            4.8657      1.673      2.908      0.004       1.583       8.148
PctHousOccup        -3.5006      1.919     -1.824      0.068      -7.265       0.264
PctHousOwnOcc       -1.8796      0.831     -2.261      0.024      -3.510      -0.249
PctVacantBoarded    22.9530      3.607      6.363      0.000      15.877      30.029
PctVacMore6Mos      -1.6623      0.833     -1.994      0.046      -3.297      -0.027
racepctblack         9.1377      2.082      4.389      0.000       5.053      13.222
racePctWhite        -4.6117      1.799     -2.563      0.010      -8.141      -1.083
racePctAsian         5.3465      3.036      1.761      0.078      -0.609      11.302
racePctHisp          5.7942      1.321      4.385      0.000       3.202       8.386
nonViolPerPop        0.0776      0.004     18.508      0.000       0.069       0.086
==============================================================================
Omnibus:                      400.287   Durbin-Watson:                   2.035
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             2305.995
Skew:                           1.179   Prob(JB):                         0.00
Kurtosis:                       8.766   Cond. No.                     4.08e+03
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 4.08e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
In [84]:
# show values
print("The coeffcient of our model is: ", coefficients[0])
print("The intercept for our model is: ", intercepts)
print ("Linear model Train dataset score is: ", model.score(X_train,y_train))
print ("Linear model Test dataset score is: ", model.score(X_test,y_test))
The coeffcient of our model is:  -38.65614796641976
The intercept for our model is:  1525.2737559291945
Linear model Train dataset score is:  0.6672082286337916
Linear model Test dataset score is:  0.6479427578855033

Dpendent Variable: NonViolCrime

In [85]:
# Subset data
X = crime1[['PctLess9thGrade', 'PctNotHSGrad', 'PctBSorMore',
                  'PctUnemployed', 'PctEmploy',
                   'PctHousOccup', 'PctHousOwnOcc', 'PctVacantBoarded', 'PctVacMore6Mos',
                   'racepctblack', 'racePctWhite', 'racePctAsian', 'racePctHisp',
                   'ViolentCrimesPerPop']]
y = crime1['nonViolPerPop']

# train data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 123)
In [86]:
# check size
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
(1426, 14) (1426,)
(476, 14) (476,)
In [88]:
# fit a model
lm1 = linear_model.LinearRegression(normalize=True)

# train model on test data
model3 = lm1.fit(X_train, y_train)

# Evaluate model
print(model3.score(X_train, y_train))

# Use model to make predictions
predictions1 = lm1.predict(X_test)

# Calculate coeffcient and intercept
coefficients1 = model3.coef_
intercepts1 = model3.intercept_
0.5149764194147157
In [89]:
# Use statsmodel
X = sm.add_constant(X)
model4 = sm.OLS(y_train,X_train).fit()
print (model4.summary())
                                 OLS Regression Results                                
=======================================================================================
Dep. Variable:          nonViolPerPop   R-squared (uncentered):                   0.879
Model:                            OLS   Adj. R-squared (uncentered):              0.877
Method:                 Least Squares   F-statistic:                              729.6
Date:                Fri, 09 Oct 2020   Prob (F-statistic):                        0.00
Time:                        17:20:23   Log-Likelihood:                         -12842.
No. Observations:                1426   AIC:                                  2.571e+04
Df Residuals:                    1412   BIC:                                  2.579e+04
Df Model:                          14                                                  
Covariance Type:            nonrobust                                                  
=======================================================================================
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
PctLess9thGrade       -16.9689     30.665     -0.553      0.580     -77.123      43.185
PctNotHSGrad            6.8982     22.874      0.302      0.763     -37.972      51.768
PctBSorMore           -10.3915      8.121     -1.280      0.201     -26.322       5.539
PctUnemployed          78.0013     35.774      2.180      0.029       7.826     148.176
PctEmploy             -13.7820      9.554     -1.443      0.149     -32.524       4.960
PctHousOccup          -28.4605     10.920     -2.606      0.009     -49.882      -7.039
PctHousOwnOcc         -28.6547      4.683     -6.119      0.000     -37.840     -19.469
PctVacantBoarded        0.6076     20.845      0.029      0.977     -40.282      41.497
PctVacMore6Mos        -18.1311      4.731     -3.833      0.000     -27.411      -8.851
racepctblack          102.3185     11.628      8.799      0.000      79.508     125.129
racePctWhite           92.7819      9.972      9.304      0.000      73.221     112.343
racePctAsian           70.7048     17.214      4.107      0.000      36.938     104.472
racePctHisp            31.1728      7.534      4.138      0.000      16.395      45.951
ViolentCrimesPerPop     2.5173      0.136     18.508      0.000       2.250       2.784
==============================================================================
Omnibus:                      802.609   Durbin-Watson:                   2.028
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            22170.196
Skew:                           2.082   Prob(JB):                         0.00
Kurtosis:                      21.862   Cond. No.                         610.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [ ]: